{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark import SparkContext\n",
    "from pyspark.sql import SparkSession\n",
    "from pyspark.sql.types import *\n",
    "from pyspark.sql import Row\n",
    "from pyspark.sql import Column"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "sc=SparkContext(appName=\"pg_dataframe\") \n",
    "spark=SparkSession.builder.appName('pg_dataframe').master(\"Yct201811021847\").getOrCreate()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_df_from_postgres():\n",
    "    df=spark.read.format('jdbc').options(\n",
    "        url='jdbc:postgresql://localhost:5432/northwind',\n",
    "        dbtable='public.orders',\n",
    "        user='postgres',\n",
    "        password='iamroot'\n",
    "    ).load()\n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "data=create_df_from_postgres()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+----------+----------+----------+------------+-----------+-------+-------+--------------------+------------------+--------+----------+--------------+-----------+\n",
      "|orderid|customerid|employeeid| orderdate|requireddate|shippeddate|shipvia|freight|            shipname|       shipaddress|shipcity|shipregion|shippostalcode|shipcountry|\n",
      "+-------+----------+----------+----------+------------+-----------+-------+-------+--------------------+------------------+--------+----------+--------------+-----------+\n",
      "|  10248|     VINET|         5|1996-07-04|  1996-08-01| 1996-07-16|      3|  32.38|Vins et alcools C...|59 rue de l'Abbaye|   Reims|      null|         51100|     France|\n",
      "|  10249|     TOMSP|         6|1996-07-05|  1996-08-16| 1996-07-10|      1|  11.61|  Toms Spezialitäten|     Luisenstr. 48| Münster|      null|         44087|    Germany|\n",
      "+-------+----------+----------+----------+------------+-----------+-------+-------+--------------------+------------------+--------+----------+--------------+-----------+\n",
      "only showing top 2 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "data.show(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "pyspark.sql.session.SparkSession"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "type(spark)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- orderid: short (nullable = true)\n",
      " |-- customerid: string (nullable = true)\n",
      " |-- employeeid: short (nullable = true)\n",
      " |-- orderdate: date (nullable = true)\n",
      " |-- requireddate: date (nullable = true)\n",
      " |-- shippeddate: date (nullable = true)\n",
      " |-- shipvia: short (nullable = true)\n",
      " |-- freight: float (nullable = true)\n",
      " |-- shipname: string (nullable = true)\n",
      " |-- shipaddress: string (nullable = true)\n",
      " |-- shipcity: string (nullable = true)\n",
      " |-- shipregion: string (nullable = true)\n",
      " |-- shippostalcode: string (nullable = true)\n",
      " |-- shipcountry: string (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "data.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "pyspark.sql.dataframe.DataFrame"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "type(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+-----------------+----------+------------------+------------------+------------------+--------------------+--------------------+--------+----------+------------------+-----------+\n",
      "|summary|          orderid|customerid|        employeeid|           shipvia|           freight|            shipname|         shipaddress|shipcity|shipregion|    shippostalcode|shipcountry|\n",
      "+-------+-----------------+----------+------------------+------------------+------------------+--------------------+--------------------+--------+----------+------------------+-----------+\n",
      "|  count|              830|       830|               830|               830|               830|                 830|                 830|     830|       323|               811|        830|\n",
      "|   mean|          10662.5|      null| 4.403614457831325|2.0072289156626506|   78.244204877492|                null|                null|    null|      null|39975.067357512955|       null|\n",
      "| stddev|239.7446558319914|      null|2.4996475400823885|0.7796852435554511|116.77929406259489|                null|                null|    null|      null| 36316.43627991753|       null|\n",
      "|    min|            10248|     ALFKI|                 1|                 1|              0.02|Alfred's Futterkiste|1 rue Alsace-Lorr...|  Aachen|        AK|            01-012|  Argentina|\n",
      "|    25%|            10455|      null|                 2|                 1|             13.37|                null|                null|    null|      null|            5022.0|       null|\n",
      "|    50%|            10662|      null|                 4|                 2|             41.34|                null|                null|    null|      null|           28023.0|       null|\n",
      "|    75%|            10870|      null|                 7|                 3|             91.48|                null|                null|    null|      null|           80805.0|       null|\n",
      "|    max|            11077|     WOLZA|                 9|                 3|           1007.64|       Wolski Zajazd|        Åkergatan 24|   Århus|        WY|           WX3 6FW|  Venezuela|\n",
      "+-------+-----------------+----------+------------------+------------------+------------------+--------------------+--------------------+--------+----------+------------------+-----------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "data.summary().show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+--------------------+------------------+\n",
      "|summary|            shipname|           freight|\n",
      "+-------+--------------------+------------------+\n",
      "|  count|                 830|               830|\n",
      "|   mean|                null|   78.244204877492|\n",
      "| stddev|                null|116.77929406259489|\n",
      "|    min|Alfred's Futterkiste|              0.02|\n",
      "|    max|       Wolski Zajazd|           1007.64|\n",
      "+-------+--------------------+------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "data.describe(['shipname','freight']).show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [],
   "source": [
    "cust_country=data.groupBy(['customerid','shipcountry'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "pyspark.sql.group.GroupedData"
      ]
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "type(cust_country)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+----------+-----------+------------------+------------------+\n",
      "|customerid|shipcountry|count(shipcountry)|      avg(freight)|\n",
      "+----------+-----------+------------------+------------------+\n",
      "|     SAVEA|        USA|                31|215.60322700008268|\n",
      "|     ERNSH|    Austria|                30|206.84633391698202|\n",
      "|     QUICK|    Germany|                28| 200.2010712495872|\n",
      "|     FOLKO|     Sweden|                19|   88.320000121468|\n",
      "|     HUNGO|    Ireland|                19|145.01262895684494|\n",
      "|     RATTC|        USA|                18|118.56722169452243|\n",
      "|     HILAA|  Venezuela|                18| 69.95333398713007|\n",
      "|     BERGS|     Sweden|                18| 86.64000034332275|\n",
      "|     BONAP|     France|                17| 79.87470626831055|\n",
      "|     FRANK|    Germany|                15|  93.5626670519511|\n",
      "|     LEHMS|    Germany|                15| 67.80199979146322|\n",
      "|     WARTH|    Finland|                15| 54.83200004498164|\n",
      "|     KOENE|    Germany|                14| 58.12000067319189|\n",
      "|     LILAS|  Venezuela|                14|  52.4578571186534|\n",
      "|     WHITC|        USA|                14|  96.6471427849361|\n",
      "|     LAMAI|     France|                14| 45.41571426817349|\n",
      "|     HANAR|     Brazil|                14|51.769285917282104|\n",
      "|     BOTTM|     Canada|                14| 56.71071410179138|\n",
      "|     MEREP|     Canada|                13|107.24769166799692|\n",
      "|     AROUT|         UK|                13| 36.30384690944965|\n",
      "+----------+-----------+------------------+------------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "cust_country.agg({'shipcountry':'count','freight':'avg'}).orderBy(['count(shipcountry)'],ascending=False).show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+----------+-------+--------------------+------------+---------------+------------+------------------+\n",
      "|customerid|freight|            shipname|avg(orderid)|avg(employeeid)|avg(shipvia)|      avg(freight)|\n",
      "+----------+-------+--------------------+------------+---------------+------------+------------------+\n",
      "|     BSBEV|  22.77|       B's Beverages|     10289.0|            7.0|         3.0|22.770000457763672|\n",
      "|     RICSU| 130.79|  Richter Supermarkt|     10751.0|            3.0|         3.0| 130.7899932861328|\n",
      "|     DRACD|  79.25|Drachenblut Delik...|     10825.0|            1.0|         1.0|             79.25|\n",
      "|     RICSU|  30.85|  Richter Supermarkt|     10951.0|            9.0|         2.0|30.850000381469727|\n",
      "|      null| 147.26|                null|     10294.0|            4.0|         2.0|147.25999450683594|\n",
      "|      null|  17.68|Magazzini Aliment...|     10300.0|            2.0|         2.0| 17.68000030517578|\n",
      "|     GODOS| 107.83|                null|     10303.0|            7.0|         2.0|107.83000183105469|\n",
      "|     LONEP|   0.56|                null|     10307.0|            2.0|         2.0|0.5600000023841858|\n",
      "|      null|  47.42|                null|     10389.0|            4.0|         2.0| 47.41999816894531|\n",
      "|     SAVEA|  89.16|                null|     10398.0|            2.0|         3.0| 89.16000366210938|\n",
      "|      null| 370.61|                null|     10424.0|            7.0|         2.0| 370.6099853515625|\n",
      "|     BSBEV|   4.87|                null|     10538.0|            9.0|         3.0| 4.869999885559082|\n",
      "|      null|  55.92|        Vaffeljernet|     10591.0|            1.0|         1.0| 55.91999816894531|\n",
      "|      null| 105.81|                null|     10659.0|            7.0|         2.0|105.80999755859375|\n",
      "|      null|   1.27|Furia Bacalhau e ...|     10664.0|            1.0|         3.0|1.2699999809265137|\n",
      "|     QUEDE|   9.53|                null|     10720.0|            8.0|         2.0| 9.529999732971191|\n",
      "|     VICTE|  22.11|                null|     10806.0|            3.0|         2.0|22.110000610351562|\n",
      "|      null|   2.71|                null|     10840.0|            4.0|         2.0|2.7100000381469727|\n",
      "|      null|  52.51|                null|     10858.0|            2.0|         1.0|  52.5099983215332|\n",
      "|     BERGS|  32.37|                null|     10875.0|            4.0|         2.0|32.369998931884766|\n",
      "+----------+-------+--------------------+------------+---------------+------------+------------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "cust_country.avg().show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "d1=data.cube(['customerid','freight','shipcountry'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
